started: Alexey Larionov, 2016
last updated: Alexey Larionov, 09Feb2017

Summary

Overall, eigenvectors are calculated for 3 datasets:

This script deals with wecare-nfe-50kgen dataset

start_section

# Time stamp
Sys.time()
## [1] "2017-02-09 16:52:24 GMT"
# Folders
setwd("/scratch/medgen/scripts/wecare_stat_01.17/scripts")
interim_data_folder <- "/scratch/medgen/scripts/wecare_stat_01.17/interim_data"

# Required libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

load_data

load(paste(interim_data_folder, "r01_read_and_clean_kgen50_jan2017.RData", sep="/"))

check_data

ls()
## [1] "gt.mx"               "interim_data_folder" "kgen50_cases.df"    
## [4] "vcf.df"
dim(gt.mx)
## [1] 83881   728
class(gt.mx)
## [1] "matrix"
gt.mx[1:5, 1:5]
##                                    HG00551_kgen HG00599_kgen HG01077_kgen
## 1_871215_Var000001972_Var000000008            0            0            0
## 1_881627_Var000002076_Var000000024            2            1            0
## 1_881918_Var000002078_Var000000027            0            0            0
## 1_883918_Var000002098_Var000000030            0            0            0
## 1_889238_Var000002161_Var000000037            0            0            0
##                                    HG01102_kgen HG01269_kgen
## 1_871215_Var000001972_Var000000008            0            0
## 1_881627_Var000002076_Var000000024            1            0
## 1_881918_Var000002078_Var000000027            0            0
## 1_883918_Var000002098_Var000000030            0            0
## 1_889238_Var000002161_Var000000037            0            0
dim(vcf.df)
## [1] 83881    12
str(vcf.df)
## 'data.frame':    83881 obs. of  12 variables:
##  $ CHROM     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ POS       : int  871215 881627 881918 883918 889238 892471 894170 897133 897325 897564 ...
##  $ ID        : Factor w/ 103045 levels "esv3639875","esv3646186",..: 51096 44307 57823 20584 64525 66260 35367 35214 71110 19157 ...
##  $ REF       : Factor w/ 674 levels "A","AAAAAAAAACAAAAAAAAAAAC",..: 164 343 343 343 343 343 343 343 343 488 ...
##  $ ALT       : Factor w/ 462 levels "A","A,ACTCT",..: 234 1 1 1 1 1 1 1 125 125 ...
##  $ QUAL      : num  2492 464846 27729 1707 27859 ...
##  $ FILTER    : Factor w/ 1 level "PASS": 1 1 1 1 1 1 1 1 1 1 ...
##  $ kgenVarID : Factor w/ 103045 levels "Var000001914",..: 2 5 6 7 11 12 13 16 17 19 ...
##  $ SplitVarID: Factor w/ 103045 levels "Var000000004",..: 2 5 6 7 11 12 13 16 17 19 ...
##  $ AF        : Factor w/ 2852 levels "0.010","0.010,1.379e-03",..: 2737 652 78 1493 73 14 1427 1450 967 978 ...
##  $ AC        : Factor w/ 1613 levels "1,1","1,18","1,2",..: 353 1539 1499 919 1407 671 919 919 463 463 ...
##  $ AN        : int  1412 1512 1494 1392 1484 1424 1518 1476 1512 1494 ...
vcf.df[1:5, 1:5]
##                                    CHROM    POS          ID REF ALT
## 1_871215_Var000001972_Var000000008     1 871215  rs28419423   C   G
## 1_881627_Var000002076_Var000000024     1 881627   rs2272757   G   A
## 1_881918_Var000002078_Var000000027     1 881918  rs35471880   G   A
## 1_883918_Var000002098_Var000000030     1 883918 rs139116730   G   A
## 1_889238_Var000002161_Var000000037     1 889238   rs3828049   G   A
dim(kgen50_cases.df)
## [1] 50  4
str(kgen50_cases.df)
## 'data.frame':    50 obs. of  4 variables:
##  $ V1: Factor w/ 50 levels "HG00551","HG00599",..: 10 30 45 12 9 44 47 46 31 11 ...
##  $ V2: Factor w/ 22 levels "ASW","BEB","CDX",..: 11 4 21 11 11 21 21 21 4 11 ...
##  $ V3: Factor w/ 5 levels "AFR","AMR","EAS",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ V4: Factor w/ 1 level "female": 1 1 1 1 1 1 1 1 1 1 ...
kgen50_cases.df[1:5,]
##                   V1  V2  V3     V4
## HG01704_kgen HG01704 IBS EUR female
## NA11830_kgen NA11830 CEU EUR female
## NA20508_kgen NA20508 TSI EUR female
## HG01773_kgen HG01773 IBS EUR female
## HG01513_kgen HG01513 IBS EUR female
# Check consistence of rownames in gt.mx and vcf.df
sum(rownames(gt.mx) != rownames(vcf.df))
## [1] 0

function_to_calculate_eigenvectors

Implements procedure described by Price et al 2006 (PMID: 16862161)

normalise_and_calculate_eigenvectors.udf <- function(x) {
  
  # --- Center and normalise variants (rows) --- #
  
  # Center by mean
  avg.rows <- apply(x, 1, mean, na.rm=TRUE)
  x.c <- x - avg.rows
  
  # Normalise by sqrt(p(1-p)) where p~"posterior estimate of unobserved allele frequency"
  # This is motivated by the fact that genetic drift per generation is proportional to this normalisation value (Patterson 2006)
  # Also this makes each column to have same variance
  # 
  p.fnc <- function(x) (1 + sum(x, na.rm=TRUE)) / (2 + 2 * sum(!is.na(x)))
  p <- apply(x, 1, p.fnc)
  eaf <- sqrt(p*(1-p))
  x.cn <- x.c/eaf
  
  # Substitute NAs to zeros
  0 -> x.cn[is.na(x)]
  
  # --- Calculate eigenvectors of covariance matrix of cases --- #
  
  cov.mx <- cov(x.cn)
  eig <- eigen(cov.mx) # eigenvectors in columns
  
  return(eig)

}

calculate_and_plot_eigenvectors_full_data

# --- Calculate eigenvectors --- #

wecare_nfe_kgen50_eigen <- normalise_and_calculate_eigenvectors.udf(gt.mx)

evectors.df <- as.data.frame(wecare_nfe_kgen50_eigen$vectors) # eigenvectors in columns
evalues <- wecare_nfe_kgen50_eigen$values

# --- Prepare data for plotting --- #

# Prepare cases IDs
cases_IDs <- colnames(gt.mx)

# Prepare cases lables
kgen50_populations <- as.vector(kgen50_cases.df[colnames(gt.mx)[1:50],"V3"])
cases_labels <- c(kgen50_populations, rep("NFE",198), rep("WECARE",480))

# make the dataframe
data2plot.df <- cbind(cases_IDs, cases_labels, evectors.df[,1:3])
colnames(data2plot.df) <- c("sample", "group", "ev1", "ev2", "ev3")

# Prepare colour scale
colours <- c("EUR" = "BLUE", "AFR" = "BROWN", "AMR" = "GREEN", "SAS" = "BLACK", "EAS" = "MAGENTA", "NFE" = "PINK", "WECARE" = "RED")
userColourScale <- scale_colour_manual(values=colours)

# --- Plot eig1 vs eig2 --- #

g <- ggplot(data2plot.df, aes(-ev1, ev2)) +
  geom_point(aes(colour=group, fill=group, text = cases_IDs)) + 
  labs(title="wecare-nfe-kgen50<br>all overlapped variants (83,881 x 728)", x ="-eigenvector1", y = "eigenvector2") +
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
# --- Plot eig2 vs eig3 --- #

g <- ggplot(data2plot.df, aes(ev2, ev3)) +
  geom_point(aes(colour=group, fill=group, text = cases_IDs)) + 
  labs(title="wecare-nfe-kgen50<br>all overlapped variants (83,881 x 728)", x ="eigenvector2", y = "eigenvector3") +
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
# --- Clean-up --- #

rm(evectors.df, evalues, colours, g, data2plot.df, kgen50_populations)

calculate_and_plot_eigenvectors_common_variants

# --- Calculate AFs --- #

ac_cln <- apply(gt.mx, 1, sum, na.rm=TRUE)
get_allele_number.fnc <- function(x){2*sum(!is.na(x))}
an_cln <- apply(gt.mx, 1, get_allele_number.fnc)
af_cln <- ac_cln/an_cln

# Ceck AFs 
# (note that uniform variants were excluded)
ac_cln[1:10]
## 1_871215_Var000001972_Var000000008 1_881627_Var000002076_Var000000024 
##                                 12                                881 
## 1_881918_Var000002078_Var000000027 1_883918_Var000002098_Var000000030 
##                                 77                                  4 
## 1_889238_Var000002161_Var000000037 1_892471_Var000002190_Var000000038 
##                                 71                                 18 
## 1_894170_Var000002203_Var000000042 1_897133_Var000002219_Var000000052 
##                                  4                                  4 
## 1_897325_Var000002220_Var000000054 1_897564_Var000002224_Var000000058 
##                               1285                               1201
an_cln[1:10]
## 1_871215_Var000001972_Var000000008 1_881627_Var000002076_Var000000024 
##                               1228                               1388 
## 1_881918_Var000002078_Var000000027 1_883918_Var000002098_Var000000030 
##                               1200                               1324 
## 1_889238_Var000002161_Var000000037 1_892471_Var000002190_Var000000038 
##                               1180                               1330 
## 1_894170_Var000002203_Var000000042 1_897133_Var000002219_Var000000052 
##                               1250                               1186 
## 1_897325_Var000002220_Var000000054 1_897564_Var000002224_Var000000058 
##                               1388                               1286
af_cln[1:10]
## 1_871215_Var000001972_Var000000008 1_881627_Var000002076_Var000000024 
##                        0.009771987                        0.634726225 
## 1_881918_Var000002078_Var000000027 1_883918_Var000002098_Var000000030 
##                        0.064166667                        0.003021148 
## 1_889238_Var000002161_Var000000037 1_892471_Var000002190_Var000000038 
##                        0.060169492                        0.013533835 
## 1_894170_Var000002203_Var000000042 1_897133_Var000002219_Var000000052 
##                        0.003200000                        0.003372681 
## 1_897325_Var000002220_Var000000054 1_897564_Var000002224_Var000000058 
##                        0.925792507                        0.933903577
min(ac_cln)
## [1] 1
min(an_cln)
## [1] 1166
min(af_cln)
## [1] 0.0006877579
max(ac_cln)
## [1] 1455
max(an_cln)
## [1] 1456
max(af_cln)
## [1] 0.9993132
# Add updated AFs to vcf.df
vcf.df <- cbind(vcf.df, ac_cln, an_cln, af_cln)

# --- Exclude rare variants --- #
# Note exclusion on both sides: high- and low- AFs
# Low AFs remove rare variants with common allele in reference genome
# Hight AFs remove rare variants with common allele in reference genome

common_vars <- af_cln > 0.05 & af_cln < 0.95
sum(common_vars) # 43,563
## [1] 43563
min(af_cln[common_vars])
## [1] 0.05006954
max(af_cln[common_vars])
## [1] 0.9497717
common_gt.mx <- gt.mx[common_vars,]
common_vcf.df <- vcf.df[common_vars,]
dim(common_gt.mx)
## [1] 43563   728
dim(common_vcf.df)
## [1] 43563    15
# --- Calculate eigenvectors --- #

common_wecare_nfe_kgen50_eigen <- normalise_and_calculate_eigenvectors.udf(common_gt.mx)

common_evectors.df <- as.data.frame(common_wecare_nfe_kgen50_eigen$vectors) # eigenvectors in columns
common_evalues <- common_wecare_nfe_kgen50_eigen$values

# --- Prepare data for plotting --- #

# make the dataframe
common_data2plot.df <- cbind(cases_IDs, cases_labels, common_evectors.df[,1:3])
colnames(common_data2plot.df) <- c("sample", "group", "ev1", "ev2", "ev3")

# --- Plot eig1 vs eig2 --- #

g <- ggplot(common_data2plot.df, aes(-ev1, -ev2)) +
  geom_point(aes(colour=group, fill=group, text = cases_IDs)) + 
  labs(title="wecare-nfe-kgen50<br>overlapped common vars (43,563 x 728)", x ="-eigenvector1", y = "-eigenvector2") +
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
# --- Plot eig2 vs eig3 --- #

g <- ggplot(common_data2plot.df, aes(ev2, ev3)) +
  geom_point(aes(colour=group, fill=group, text = cases_IDs)) + 
  labs(title="wecare-nfe-kgen50<br>overlapped common vars (43,563 x 728)", x ="eigenvector2", y = "eigenvector3") +
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
# --- Clean-up --- #

rm(ac_cln, an_cln, af_cln, get_allele_number.fnc, common_vars, common_evectors.df, common_evalues, cases_IDs, cases_labels, userColourScale, g, normalise_and_calculate_eigenvectors.udf, common_data2plot.df)

compare_eigenvalues_and_eigenvectors_in_all_and_common_variants

# --- Compare eigenvalues --- #

eval_all <- wecare_nfe_kgen50_eigen$values
eval_common <- common_wecare_nfe_kgen50_eigen$values

plot(eval_all, main="Wecare-nfe-kgen50 eigenvalues (all variants)")

plot(eval_common, main="Wecare-nfe-kgen50 eigenvalues (common variants)")

plot(eval_all,eval_common, main="Wecare-nfe-kgen50 eigenvalues (all vs common variants)")

# --- Compare eigenvectors --- #

# Gather data
ev1_all <- wecare_nfe_kgen50_eigen$vectors[,1]
ev1_common <- common_wecare_nfe_kgen50_eigen$vectors[,1]
ev2_all <- wecare_nfe_kgen50_eigen$vectors[,2]
ev2_common <- common_wecare_nfe_kgen50_eigen$vectors[,2]
ev3_all <- wecare_nfe_kgen50_eigen$vectors[,3]
ev3_common <- common_wecare_nfe_kgen50_eigen$vectors[,3]

data2plot.df <- as.data.frame(cbind(ev1_all, ev2_all, ev3_all, ev1_common, ev2_common, ev3_common))

# Calculate correlations
cor.test(ev1_all, ev1_common) # 0.8278741, p-value < 2.2e-16
## 
##  Pearson's product-moment correlation
## 
## data:  ev1_all and ev1_common
## t = 39.768, df = 726, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8035493 0.8494384
## sample estimates:
##       cor 
## 0.8278741
cor.test(ev2_all, ev2_common) # -0.5837235, p-value < 2.2e-16
## 
##  Pearson's product-moment correlation
## 
## data:  ev2_all and ev2_common
## t = -19.371, df = 726, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6296786 -0.5336974
## sample estimates:
##        cor 
## -0.5837235
cor.test(ev3_all, ev3_common) # -0.003093386, p-value = 0.9336
## 
##  Pearson's product-moment correlation
## 
## data:  ev3_all and ev3_common
## t = -0.08335, df = 726, p-value = 0.9336
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07573931  0.06958520
## sample estimates:
##          cor 
## -0.003093386
# Common sence check (these eigenvectors should be orthogonal...)
cor.test(ev1_all, ev2_all) # 2.15652e-16, p-value = 1
## 
##  Pearson's product-moment correlation
## 
## data:  ev1_all and ev2_all
## t = 5.8106e-15, df = 726, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07266294  0.07266294
## sample estimates:
##         cor 
## 2.15652e-16
cor.test(ev1_common, ev2_common) # 2.692919e-16, p-value = 1
## 
##  Pearson's product-moment correlation
## 
## data:  ev1_common and ev2_common
## t = 7.2559e-15, df = 726, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07266294  0.07266294
## sample estimates:
##          cor 
## 2.692919e-16
# Make plots
g <- ggplot(data2plot.df, aes(ev1_all, ev1_common)) +
  geom_point() + 
  labs(title="Wecare-nfe-kgen50 eigenvector 1<br>all vs common variants (p=0.0003)")
ggplotly(g)
g <- ggplot(data2plot.df, aes(ev2_all, ev2_common)) +
  geom_point() + 
  labs(title="Wecare-nfe-kgen50 eigenvector 2<br>all vs common variants (p=8e-13)")
ggplotly(g)
g <- ggplot(data2plot.df, aes(ev3_all, ev3_common)) +
  geom_point() + 
  labs(title="Wecare-nfe-kgen50 eigenvector 3<br>all vs common variants (p-value = 0.2193)")
ggplotly(g)
# Clean-up
rm(eval_all, eval_common, ev1_all, ev2_all, ev3_all, ev1_common, ev2_common, ev3_common, g, data2plot.df)

export_wecare_nfe_khen50_to_EIGENSTRAT_format

Omitted

data_summary

ls()
## [1] "common_gt.mx"                   "common_vcf.df"                 
## [3] "common_wecare_nfe_kgen50_eigen" "gt.mx"                         
## [5] "interim_data_folder"            "kgen50_cases.df"               
## [7] "vcf.df"                         "wecare_nfe_kgen50_eigen"
# wecare nfe

dim(gt.mx)
## [1] 83881   728
class(gt.mx)
## [1] "matrix"
gt.mx[1:5,1:5]
##                                    HG00551_kgen HG00599_kgen HG01077_kgen
## 1_871215_Var000001972_Var000000008            0            0            0
## 1_881627_Var000002076_Var000000024            2            1            0
## 1_881918_Var000002078_Var000000027            0            0            0
## 1_883918_Var000002098_Var000000030            0            0            0
## 1_889238_Var000002161_Var000000037            0            0            0
##                                    HG01102_kgen HG01269_kgen
## 1_871215_Var000001972_Var000000008            0            0
## 1_881627_Var000002076_Var000000024            1            0
## 1_881918_Var000002078_Var000000027            0            0
## 1_883918_Var000002098_Var000000030            0            0
## 1_889238_Var000002161_Var000000037            0            0
dim(common_gt.mx)
## [1] 43563   728
class(common_gt.mx)
## [1] "matrix"
common_gt.mx[1:5,1:5]
##                                    HG00551_kgen HG00599_kgen HG01077_kgen
## 1_881627_Var000002076_Var000000024            2            1            0
## 1_881918_Var000002078_Var000000027            0            0            0
## 1_889238_Var000002161_Var000000037            0            0            0
## 1_897325_Var000002220_Var000000054            2            2            2
## 1_897564_Var000002224_Var000000058            2            2            2
##                                    HG01102_kgen HG01269_kgen
## 1_881627_Var000002076_Var000000024            1            0
## 1_881918_Var000002078_Var000000027            0            0
## 1_889238_Var000002161_Var000000037            0            0
## 1_897325_Var000002220_Var000000054            2            1
## 1_897564_Var000002224_Var000000058            2            2
dim(vcf.df)
## [1] 83881    15
str(vcf.df)
## 'data.frame':    83881 obs. of  15 variables:
##  $ CHROM     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ POS       : int  871215 881627 881918 883918 889238 892471 894170 897133 897325 897564 ...
##  $ ID        : Factor w/ 103045 levels "esv3639875","esv3646186",..: 51096 44307 57823 20584 64525 66260 35367 35214 71110 19157 ...
##  $ REF       : Factor w/ 674 levels "A","AAAAAAAAACAAAAAAAAAAAC",..: 164 343 343 343 343 343 343 343 343 488 ...
##  $ ALT       : Factor w/ 462 levels "A","A,ACTCT",..: 234 1 1 1 1 1 1 1 125 125 ...
##  $ QUAL      : num  2492 464846 27729 1707 27859 ...
##  $ FILTER    : Factor w/ 1 level "PASS": 1 1 1 1 1 1 1 1 1 1 ...
##  $ kgenVarID : Factor w/ 103045 levels "Var000001914",..: 2 5 6 7 11 12 13 16 17 19 ...
##  $ SplitVarID: Factor w/ 103045 levels "Var000000004",..: 2 5 6 7 11 12 13 16 17 19 ...
##  $ AF        : Factor w/ 2852 levels "0.010","0.010,1.379e-03",..: 2737 652 78 1493 73 14 1427 1450 967 978 ...
##  $ AC        : Factor w/ 1613 levels "1,1","1,18","1,2",..: 353 1539 1499 919 1407 671 919 919 463 463 ...
##  $ AN        : int  1412 1512 1494 1392 1484 1424 1518 1476 1512 1494 ...
##  $ ac_cln    : num  12 881 77 4 71 ...
##  $ an_cln    : num  1228 1388 1200 1324 1180 ...
##  $ af_cln    : num  0.00977 0.63473 0.06417 0.00302 0.06017 ...
vcf.df[1:5,1:5]
##                                    CHROM    POS          ID REF ALT
## 1_871215_Var000001972_Var000000008     1 871215  rs28419423   C   G
## 1_881627_Var000002076_Var000000024     1 881627   rs2272757   G   A
## 1_881918_Var000002078_Var000000027     1 881918  rs35471880   G   A
## 1_883918_Var000002098_Var000000030     1 883918 rs139116730   G   A
## 1_889238_Var000002161_Var000000037     1 889238   rs3828049   G   A
dim(common_vcf.df)
## [1] 43563    15
str(common_vcf.df)
## 'data.frame':    43563 obs. of  15 variables:
##  $ CHROM     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ POS       : int  881627 881918 889238 897325 897564 897738 900505 909309 911916 948846 ...
##  $ ID        : Factor w/ 103045 levels "esv3639875","esv3646186",..: 44307 57823 64525 71110 19157 83315 51689 64588 89632 64773 ...
##  $ REF       : Factor w/ 674 levels "A","AAAAAAAAACAAAAAAAAAAAC",..: 343 343 343 343 488 164 343 488 164 488 ...
##  $ ALT       : Factor w/ 462 levels "A","A,ACTCT",..: 1 1 1 125 125 340 125 125 340 347 ...
##  $ QUAL      : num  464846 27729 27859 991719 579086 ...
##  $ FILTER    : Factor w/ 1 level "PASS": 1 1 1 1 1 1 1 1 1 1 ...
##  $ kgenVarID : Factor w/ 103045 levels "Var000001914",..: 5 6 11 17 19 21 25 30 34 47 ...
##  $ SplitVarID: Factor w/ 103045 levels "Var000000004",..: 5 6 11 17 19 21 25 30 34 47 ...
##  $ AF        : Factor w/ 2852 levels "0.010","0.010,1.379e-03",..: 652 78 73 967 978 87 294 186 163 894 ...
##  $ AC        : Factor w/ 1613 levels "1,1","1,18","1,2",..: 1539 1499 1407 463 463 1602 912 725 659 351 ...
##  $ AN        : int  1512 1494 1484 1512 1494 1474 1482 1498 1396 1520 ...
##  $ ac_cln    : num  881 77 71 1285 1201 ...
##  $ an_cln    : num  1388 1200 1180 1388 1286 ...
##  $ af_cln    : num  0.6347 0.0642 0.0602 0.9258 0.9339 ...
common_vcf.df[1:5,1:5]
##                                    CHROM    POS         ID REF ALT
## 1_881627_Var000002076_Var000000024     1 881627  rs2272757   G   A
## 1_881918_Var000002078_Var000000027     1 881918 rs35471880   G   A
## 1_889238_Var000002161_Var000000037     1 889238  rs3828049   G   A
## 1_897325_Var000002220_Var000000054     1 897325  rs4970441   G   C
## 1_897564_Var000002224_Var000000058     1 897564 rs13303229   T   C
dim(kgen50_cases.df)
## [1] 50  4
colnames(kgen50_cases.df)
## [1] "V1" "V2" "V3" "V4"
kgen50_cases.df[1:5,]
##                   V1  V2  V3     V4
## HG01704_kgen HG01704 IBS EUR female
## NA11830_kgen NA11830 CEU EUR female
## NA20508_kgen NA20508 TSI EUR female
## HG01773_kgen HG01773 IBS EUR female
## HG01513_kgen HG01513 IBS EUR female
class(wecare_nfe_kgen50_eigen)
## [1] "list"
str(wecare_nfe_kgen50_eigen)
## List of 2
##  $ values : num [1:728] 33.91 14.48 9.36 9.07 8.83 ...
##  $ vectors: num [1:728, 1:728] -0.0218 -0.0325 -0.0428 -0.0368 -0.0204 ...
class(common_wecare_nfe_kgen50_eigen)
## [1] "list"
str(common_wecare_nfe_kgen50_eigen)
## List of 2
##  $ values : num [1:728] 18.49 10.97 9.56 5.39 4.31 ...
##  $ vectors: num [1:728, 1:728] -0.0399 -0.1615 -0.0578 -0.0428 -0.0438 ...
sum(rownames(gt.mx) != rownames(vcf.df))
## [1] 0
sum(rownames(common_gt.mx) != rownames(common_vcf.df))
## [1] 0
sum(colnames(gt.mx) != colnames(common_gt.mx))
## [1] 0

save_data

save.image(paste(interim_data_folder, "r05_calculate_egenvectors_wecare_nfe_kgen50_jan2017.RData", sep="/"))

final_section

ls()
## [1] "common_gt.mx"                   "common_vcf.df"                 
## [3] "common_wecare_nfe_kgen50_eigen" "gt.mx"                         
## [5] "interim_data_folder"            "kgen50_cases.df"               
## [7] "vcf.df"                         "wecare_nfe_kgen50_eigen"
sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Scientific Linux release 6.8 (Carbon)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] plotly_4.5.6  ggplot2_2.2.1 dplyr_0.5.0  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.6       knitr_1.13        magrittr_1.5     
##  [4] munsell_0.4.3     viridisLite_0.1.3 colorspace_1.2-6 
##  [7] R6_2.1.2          httr_1.2.1        stringr_1.0.0    
## [10] plyr_1.8.4        tools_3.2.3       grid_3.2.3       
## [13] gtable_0.2.0      DBI_0.5           htmltools_0.3.5  
## [16] yaml_2.1.13       lazyeval_0.2.0    assertthat_0.1   
## [19] digest_0.6.10     tibble_1.1        tidyr_0.5.1      
## [22] purrr_0.2.2       formatR_1.4       base64enc_0.1-3  
## [25] htmlwidgets_0.8   evaluate_0.9      rmarkdown_1.0    
## [28] labeling_0.3      stringi_1.1.1     scales_0.4.1     
## [31] jsonlite_1.0
Sys.time()
## [1] "2017-02-09 16:53:53 GMT"